#! /usr/bin/env python
# -*- coding: utf-8 -*-

# *************************************************************
#     Filename @  test.py
#       Author @  Huoty
#  Create date @  2015-06-25 10:08:02
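#  Description @  Fetch page 1 of the qiushibaike.com "8hr" listing and print the posts that contain no image or video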
# *************************************************************

import urllib
import urllib2
import re

# Headers sent with the request: a browser-like User-Agent.
# NOTE(review): the original comment labels this "headers validation";
# presumably the site checks the User-Agent -- confirm.
user_agent = "Mozilla/5.0 (X11; Linux i686)"
headers = {"User-Agent": user_agent}

# Listing page to fetch; the page number is appended to the base path.
page = 1
url = "http://www.qiushibaike.com/8hr/page/%d" % page

# Pattern for one post in the page HTML.  It has five capture groups, in order:
#   1. the text following the <img> inside the author link,
#   2. the text of the "content" div up to its HTML comment,
#   3. the body of that HTML comment,
#   4. everything between the closing </div> and the "stats" div,
#   5. the text of the <i class="number"> element.
reg = (
    '<div class="article block untagged mb15".*?>.*?'
    '<div class="author">.*?<a.*?>.*?<img.*?>(.*?)</a>.*?'
    '<div class="content">(.*?)<!--(.*?)-->.*?</div>(.*?)'
    '<div class="stats">.*?<i class="number">(.*?)</i>'
)

# Script starts from here

# Download the listing page, then print every post that has no image/video.
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        # Decode once at the I/O boundary so the regex runs on unicode text.
        content = response.read().decode("utf-8")
    finally:
        # Release the connection even if read() or decode() raises.
        response.close()
except urllib2.URLError as e:
    # Report whatever diagnostic attributes the error carries: "code" is
    # printed when present, and "reason" is printed when present.
    if hasattr(e, "code"):
        print(e.code)

    if hasattr(e, "reason"):
        print(e.reason)
else:
    # Parsing and printing run only after a successful download, so the
    # handler above deals exclusively with network failures.
    pattern = re.compile(reg, re.S)

    # Each match is a 5-tuple, one entry per capture group in `reg`.
    for author, text, posted_at, trailer, votes in pattern.findall(content):
        # Skip posts whose markup after the content div mentions media.
        if re.search("img|video", trailer):
            continue

        # Labels: author / publish time / like count.  Each print() appends
        # the newline that the original wrote explicitly, so the output is
        # byte-for-byte the same as before.
        print(u"发布作者:%s\t发布时间:%s\t点赞数:%s"
              % (author.strip(), posted_at.strip(), votes.strip()))
        print(text.strip() + "\n\n")
